/**
 * SPDX-FileCopyrightText: Peter Pentchev <roam@ringlet.net>
 * SPDX-License-Identifier: BSD-2-Clause
 */

#define _GNU_SOURCE

#include <err.h>
#include <regex.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <utf8_locale.h>

/*
 * An extended regular expression (it is compiled with REG_EXTENDED) that
 * matches a whole locale name of the form
 *
 *   language[_part][.codeset][@part]
 *
 * Capture groups, as used with an eight-element regmatch_t array:
 *   1 - the language name (letters and digits);
 *   2 - the optional "_..." suffix including the underscore,
 *   3 - ...and the same suffix without the underscore
 *       (conventionally the territory);
 *   4 - the optional ".codeset" suffix including the dot,
 *   5 - ...and the codeset itself (letters, digits, and dashes);
 *   6 - the optional "@..." suffix including the at sign,
 *   7 - ...and the same suffix without the at sign
 *       (conventionally the modifier).
 */
#define RE_LOCALE \
	"^" \
	"([a-zA-Z0-9]+)" \
	"(" \
		"_" \
		"([a-zA-Z0-9]+)" \
	")?" \
	"(" \
		"\\." \
		"([a-zA-Z0-9-]+)" \
	")?" \
	"(" \
		"@" \
		"([a-zA-Z0-9]+)" \
	")?" \
	"$"

/*
 * The NULL-terminated default list of languages that detect_utf8_locale()
 * uses when it is passed a null pointer; entries that come earlier in
 * the list are preferred over later ones.
 */
static const char * const utf8_languages[] = {"C", "en", "de", "es", "it", NULL};

/*
 * The NULL-terminated list of environment variable names that
 * get_preferred_languages() looks up, in this order, to collect
 * the languages of the UTF-8 locales that they refer to.
 */
static const char * const locale_vars[] = {
    "LC_ALL",
    "LANG",
    "LC_MESSAGES",
    "LC_COLLATE",
    "LC_NAME",
    "LC_IDENTIFICATION",
    "LC_CTYPE",
    "LC_NUMERIC",
    "LC_TIME",
    "LC_MONETARY",
    "LC_PAPER",
    "LC_ADDRESS",
    "LC_TELEPHONE",
    "LC_MEASUREMENT",
    NULL,
};

/* The compiled form of RE_LOCALE; only valid once re_compiled is true. */
static regex_t re_locale_;
/* Set by build_re_locale() after RE_LOCALE has been compiled successfully. */
static bool re_compiled;

/*
 * The process environment. POSIX specifies this exact declaration;
 * adding a `const` qualifier would both make the type incompatible with
 * the real object and let the compiler assume that the pointer never changes.
 */
extern char **environ;

/**
 * Return the compiled form of RE_LOCALE, compiling it on the first call.
 *
 * The compiled expression is stored in the file-scope re_locale_ object and
 * re_compiled records that the compilation has already been done.
 * If regcomp() fails, the message obtained from regerror() is reported
 * via errx() with an exit code of 1.
 */
static const regex_t *
build_re_locale(void)
{
	if (re_compiled)
		return &re_locale_;

	const int rc = regcomp(&re_locale_, RE_LOCALE, REG_EXTENDED);
	if (rc != 0) {
#if __STDC_NO_VLA__
#error VLA support is missing
#endif
		/* Ask for the message length first, then fetch the message itself. */
		const size_t msg_size = regerror(rc, &re_locale_, NULL, 0);
		char msg[msg_size];
		regerror(rc, &re_locale_, msg, sizeof(msg));
		errx(1, "Internal error: regcomp() failed: %s", msg);
	}

	re_compiled = true;
	return &re_locale_;
}

/**
 * Count the strings in a NULL-terminated array of pointers.
 *
 * The terminating null pointer itself is not counted.
 */
static inline size_t
count_strings(const char * const * const arr)
{
	const char * const *end = arr;

	while (*end != NULL)
		end++;
	return (size_t)(end - arr);
}

static void
free_strings(char ** const arr)
{
	for (char **vptr = arr; *vptr != NULL; vptr++)
		free(*vptr);
	free(arr);
}

/**
 * Check whether a NULL-terminated array of strings contains a string
 * that compares equal (as per strcmp()) to the specified one.
 */
static inline bool
has_string(char * const * const haystack, const char * const needle)
{
	size_t idx = 0;

	while (haystack[idx] != NULL) {
		if (strcmp(haystack[idx], needle) == 0)
			return true;
		idx++;
	}
	return false;
}

/**
 * Pick the best UTF-8 locale out of those listed by `locale -a`.
 *
 * @param pref_languages a NULL-terminated list of language names, most
 *        preferred first, or a null pointer to use the built-in
 *        utf8_languages list
 * @return a newly allocated locale name that the caller must free(),
 *         or a null pointer if memory could not be allocated or
 *         `locale -a` could not be run, read, or did not exit successfully
 *
 * Only locales with a codeset of exactly "utf8" or "UTF-8" are considered.
 * If none of them matches a preferred language, the string "C" is returned.
 */
char *
detect_utf8_locale(const char * const * const pref_languages)
{
	const regex_t * const re_locale = build_re_locale();

	const char * const * const languages =
	    pref_languages != NULL ? pref_languages : utf8_languages;
	const size_t unweight = count_strings(languages);

	char *best_loc = strdup("C");
	if (best_loc == NULL) {
		warn("Could not allocate memory for a locale name");
		return NULL;
	}
	/* A lower value is better; this one means "nothing found yet". */
	size_t best_prio = unweight;

	FILE * const locfile = popen("locale -a", "r");
	if (locfile == NULL) {
		/* Report the error first: free() is allowed to modify errno. */
		warn("Could not execute `locale -a`");
		free(best_loc);
		return NULL;
	}

	bool nomem = false;
	char locname[200];
	while (fgets(locname, sizeof(locname), locfile) != NULL) {
		/*
		 * Once the outcome is known, keep reading but ignore the rest of
		 * the output: closing the pipe early may get `locale -a` killed by
		 * a SIGPIPE signal, which pclose() would then report as a failure.
		 */
		if (nomem || best_prio == 0)
			continue;

		size_t len = strlen(locname);
		while (len > 0 && (locname[len - 1] == '\r' || locname[len - 1] == '\n'))
			locname[--len] = '\0';

		/* Anything but zero leaves the contents of `matches` unspecified. */
		regmatch_t matches[8];
		if (regexec(re_locale, locname,
		    sizeof(matches) / sizeof(matches[0]), matches, 0) != 0)
			continue;

		/* Check if the codeset is "utf8" or "UTF-8". */
		if (matches[5].rm_so == -1)
			continue;
		const char * const codeset = &locname[matches[5].rm_so];
		const size_t codeset_len = (size_t)(matches[5].rm_eo - matches[5].rm_so);
		const bool is_utf8 =
		    (codeset_len == 4 && strncmp(codeset, "utf8", 4) == 0) ||
		    (codeset_len == 5 && strncmp(codeset, "UTF-8", 5) == 0);
		if (!is_utf8)
			continue;

		/* The language name starts at the very beginning of the line. */
		const size_t lang_len = (size_t)matches[1].rm_eo;

		/* Only look at the languages that are better than the current one. */
		for (size_t prio = 0; prio < best_prio && languages[prio] != NULL; prio++) {
			if (strlen(languages[prio]) != lang_len ||
			    strncmp(locname, languages[prio], lang_len) != 0)
				continue;

			char * const candidate = strdup(locname);
			if (candidate == NULL) {
				warn("Could not allocate memory for a locale name");
				nomem = true;
			} else {
				free(best_loc);
				best_loc = candidate;
				best_prio = prio;
			}
			break;
		}
	}

	const bool failed = ferror(locfile) != 0;
	if (failed)
		warn("Could not read the output of `locale -a`");

	const int status = pclose(locfile);
	if (status == -1) {
		/* pclose() itself failed, so errno is meaningful here. */
		warn("`locale -a` failed");
		free(best_loc);
		return NULL;
	}
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		/* The child process failed; errno has nothing to say about that. */
		warnx("`locale -a` failed");
		free(best_loc);
		return NULL;
	}
	if (failed || nomem) {
		free(best_loc);
		return NULL;
	}

	return best_loc;
}

/**
 * Build the environment variable settings that select a UTF-8 locale.
 *
 * @param languages the list of preferred languages that is passed on to
 *        detect_utf8_locale() unchanged
 * @return a newly allocated NULL-terminated array of two newly allocated
 *         strings, "LC_ALL=<locale>" and "LANGUAGE=", or a null pointer
 *         on failure; the caller owns both the strings and the array
 */
char **
get_utf8_vars(const char * const * const languages)
{
	char * const locname = detect_utf8_locale(languages);
	if (locname == NULL)
		return NULL;

	/* Two variables and the terminating null pointer. */
	char ** const vars = calloc(3, sizeof(*vars));
	if (vars == NULL) {
		warn("Could not allocate memory for 3 environment variables");
		free(locname);
		return NULL;
	}

	if (asprintf(&vars[0], "LC_ALL=%s", locname) == -1) {
		warn("Could not build an LC_ALL=... string");
		/* vars[0] holds no usable value after a failure; do not free it. */
		free(vars);
		free(locname);
		return NULL;
	}
	free(locname);

	if (asprintf(&vars[1], "LANGUAGE=") == -1) {
		warn("Could not build a LANGUAGE= string");
		/* The first string was built successfully; do not leak it. */
		free(vars[0]);
		free(vars);
		return NULL;
	}

	return vars;
}

/**
 * Look for a "NAME=value" environment string's name in a list of settings.
 *
 * @param evar the environment string to look up
 * @param vars a NULL-terminated array of "NAME=value" strings
 * @return the index of the first element of `vars` that starts with
 *         the same "NAME=" prefix, or the number of strings in `vars` if
 *         there is no such element or `evar` does not contain a '='
 */
static size_t
find_env_var(const char * const evar, char ** const vars)
{
	const char * const sep = strchr(evar, '=');
	if (sep == NULL)
		return count_strings((const char * const *)vars);

	/* Include the '=' so that "LC_ALL=" does not match "LC_ALL_X=". */
	const size_t prefix_len = (size_t)(sep - evar) + 1;

	size_t idx = 0;
	while (vars[idx] != NULL && strncmp(evar, vars[idx], prefix_len) != 0)
		idx++;
	return idx;
}

/**
 * Release the strings that have not been flagged as found, then release
 * both the array of strings and the array of flags.
 *
 * @param vars a NULL-terminated array of strings
 * @param vfound an array of flags, one for each string in `vars`;
 *        the strings with a true flag are left alone
 */
static void
free_strings_not_found(char ** const vars, bool * const vfound)
{
	for (size_t idx = 0; vars[idx] != NULL; idx++) {
		if (vfound[idx])
			continue;
		free(vars[idx]);
	}
	free(vars);
	free(vfound);
}

/**
 * Build a copy of the current environment that selects a UTF-8 locale.
 *
 * @param languages the list of preferred languages that is passed on to
 *        get_utf8_vars() unchanged
 * @return a newly allocated NULL-terminated array of newly allocated
 *         "NAME=value" strings, or a null pointer on failure; the caller
 *         owns both the strings and the array
 *
 * The variables returned by get_utf8_vars() replace the first occurrence of
 * the variable with the same name in the environment; any further
 * occurrences are dropped with a warning. The ones that are not present in
 * the environment at all are appended at the end.
 */
char **
get_utf8_env(const char * const * const languages)
{
	char ** const vars = get_utf8_vars(languages);
	if (vars == NULL)
		return NULL;
	const size_t vcount = count_strings((const char * const *)vars);
	bool * const vfound = calloc(vcount, sizeof(*vfound));
	if (vfound == NULL) {
		warn("Could not allocate room for %zu boolean flags", vcount);
		free_strings(vars);
		return NULL;
	}

	/* Room for the whole environment, all our variables, and a null pointer. */
	const size_t ecount = count_strings((const char * const *)environ);
	const size_t allocated = ecount + vcount + 1;
	char ** const res = calloc(allocated, sizeof(*res));
	if (res == NULL) {
		warn("Could not allocate room for %zu environment variables", allocated);
		free_strings(vars);
		free(vfound);
		return NULL;
	}

	size_t next = 0;
	for (char **env = environ; *env != NULL; env++) {
		char * const evar = *env;
		const size_t found = find_env_var(evar, vars);
		if (found != vcount && vfound[found]) {
			warnx("Setting an environment variable twice: %s", evar);
			continue;
		}

		/*
		 * Check before storing anything: the last element must remain
		 * a null pointer so that `res` can be walked and released.
		 */
		if (next + 1 >= allocated) {
			warnx("The environment changed while we were processing it");
			goto fail;
		}

		if (found == vcount) {
			char * const copy = strdup(evar);
			if (copy == NULL) {
				warn("Could not copy an environment variable");
				goto fail;
			}
			res[next++] = copy;
		} else {
			/* Hand the string over to `res` and remember that we did. */
			res[next++] = vars[found];
			vfound[found] = true;
		}
	}

	/* Append the variables that were not present in the environment. */
	for (size_t vidx = 0; vidx < vcount; vidx++) {
		if (vfound[vidx])
			continue;

		if (next + 1 >= allocated) {
			warnx("The environment changed while we were processing it");
			goto fail;
		}
		res[next++] = vars[vidx];
		vfound[vidx] = true;
	}

	/* All the strings belong to `res` now; only release the containers. */
	free(vars);
	free(vfound);
	return res;

fail:
	/* The strings flagged as found are released as elements of `res`. */
	free_strings_not_found(vars, vfound);
	free_strings(res);
	return NULL;
}

/**
 * Collect the languages of the UTF-8 locales named in the environment.
 *
 * The variables listed in locale_vars are examined in order; the values
 * that are not locale names with a codeset of exactly "utf8" or "UTF-8" are
 * skipped, and each language is only listed once. "C" is appended at
 * the end unless it is already there.
 *
 * @return a newly allocated NULL-terminated array of newly allocated
 *         language names, or a null pointer if memory could not be
 *         allocated; the caller owns both the strings and the array
 */
char **
get_preferred_languages(void)
{
	const regex_t * const re_locale = build_re_locale();

	/* One language per variable at the most, then "C" and a null pointer. */
	const size_t allocated = count_strings(locale_vars) + 2;
	char **languages = calloc(allocated, sizeof(*languages));
	if (languages == NULL) {
		warn("Could not allocate memory for %zu language strings", allocated);
		return NULL;
	}

	size_t count = 0;
	for (const char * const *lvar = locale_vars; *lvar != NULL; lvar++) {
		const char * const value = getenv(*lvar);
		if (value == NULL)
			continue;

		/* Anything but zero leaves the contents of `matches` unspecified. */
		regmatch_t matches[8];
		if (regexec(re_locale, value,
		    sizeof(matches) / sizeof(matches[0]), matches, 0) != 0)
			continue;

		/* Check if the codeset is "utf8" or "UTF-8". */
		if (matches[5].rm_so == -1)
			continue;
		if ((
		      matches[5].rm_eo - matches[5].rm_so != 4 ||
		      strncmp(&value[matches[5].rm_so], "utf8", 4) != 0
		    ) &&
		    (
		      matches[5].rm_eo - matches[5].rm_so != 5 ||
		      strncmp(&value[matches[5].rm_so], "UTF-8", 5) != 0
		    ))
			continue;

		/* The language name starts at the very beginning of the value. */
		char * const lang = strndup(value, (size_t)matches[1].rm_eo);
		if (lang == NULL) {
			warn("Could not allocate memory for a language name");
			free_strings(languages);
			return NULL;
		}
		if (has_string(languages, lang))
			free(lang);
		else
			languages[count++] = lang;
	}

	if (!has_string(languages, "C")) {
		char * const fallback = strdup("C");
		if (fallback == NULL) {
			warn("Could not allocate memory for a language name");
			free_strings(languages);
			return NULL;
		}
		languages[count++] = fallback;
	}

	/* Give back the part of the array that turned out not to be needed. */
	if (count + 1 < allocated) {
		char ** const nlang = realloc(languages, (count + 1) * sizeof(*languages));
		if (nlang == NULL) {
			warn("Could not reallocate memory for %zu language names", count + 1);
			free_strings(languages);
			return NULL;
		}
		languages = nlang;
	}
	return languages;
}
